Setup

rm(list = ls(all.names = TRUE))

if(!require("pacman")) install.packages("pacman"); library(pacman)

pacman::p_load(
  tidyverse,
  scales,
  janitor,
  GCLr,
  coin
)

knitr::opts_chunk$set(fig.width = 10)
knitr::opts_chunk$set(echo = TRUE)

.username = readLines("~/usr_pw.txt", n = 1)  # LOKI username
.password = readLines("~/usr_pw.txt" , n = 2)[[2]]  # LOKI password

Objective

The purpose of this notebook is to provide Wes Larson (NOAA ABL) with adaptive run timing genotypes (9 loci) for AHRP chum samples. Filter out plates undergoing QC crosschecks due to poor genotyping success. Wes is interested in representation across years, less across multiple streams. He wants to see how the patterns hold up over time.

Steps

  • import all existing genotypes for CM064-CM070
  • filter out untrustworthy plates that need QC cross checks
  • standard GT-seq genotype QA (missing, heterozygosity, duplicate)
  • join sample date and spawning state <“../OceanAK/AHRP Salmon Biological Data 20240105_122404.368459_with_spawning_state.csv”>
  • filter for 9 adaptive run timing loci
  • export genotypes with date/spawning state

Background

An adaptive marker panel associated with run timing variation in chum salmon was developed by NOAA ABL and used to genotype two plates of samples from the ADFG Alaska Hatchery Research Program (AHRP) study being conducted within SE Alaska. AHRP samples (Fish Creek - Douglas Island 2023 + Prospect Creek 2017/2018) were genotyped in August 2024 for 20 single nucleotide polymorphisms and weighted linear regressions were performed to assess the association of different SNPs with run timing (date of collection). The chum salmon adaptive marker panel contains 20 loci spread among 5 chromosomes and was developed from variation detected between Yukon River summer (Gisasa River) and fall (Pelly River) run chum salmon with low coverage whole genome sequencing.These populations spawn approximately two months and over 1500 river km apart. NOAA targeted the major peaks in differentiation with particular interest in markers within the genes leucine rich repeat containing nine (LRRC9; LG35) and estrogen receptor β (ESRB; LG29) as they were associated with return timing in other salmonid species. Eight SNPs (four on LG29 and four on LG35) appear to be associated with run timing within these collections. The strength of association for AHRP samples appears to vary by population. No markers on LGs 5, 23 or 15 appear to have a strong association with run timing within these collections. Overall, the results from LG29 and LG35 are promising and variation in these SNPs should be surveyed for a larger collection of samples.

A total of 9 adaptive, run timing loci were added to the AHRP SEAK Chum parentage panel:

  • OkeV2_LG23_10546237
  • OkeV2_LG29_25450752
  • OkeV2_LG29_25455833
  • OkeV2_LG29_25483595
  • OkeV2_LG29_25515161
  • OkeV2_LG35_28078867
  • OkeV2_LG35_28128687
  • OkeV2_LG35_28165076
  • OkeV2_LG35_28167172

All AHRP Chum samples from Fish Creek - Douglas Island, Prospect Creek, Admiralty Creek, and Sawmill Creek were genotyped for the parentage panel at the ADF&G Gene Conservation Laboratory (GCL) in Winter/Spring 2025. Due to poor tissue quality, there was a high proportion of samples that failed to genotype and/or appear to be contaminated (high heterozygosity), resulting in low QC power for several plates. QC crosschecks are currently underway at the GCL in July 2025, however, Wes wants whatever genotypes we have on hand now so will just filter out those plates.

Import Data

LocusControl

Create objects, create LocusControl for Chum_AHRP_257_v1.3.0.

fs::dir_create("objects")
GCLr::create_locuscontrol(markersuite = "Chum_AHRP_257_v1.3.0",
                          username = .username,
                          password = .password)
GCLr::save_objects(objects = "LocusControl", path = "objects")

Hrmm, only 251 loci. Confirm that these are the same loci run in the most recent AHRP Chum project CM069

probe_loci_CM069 <- readr::read_tsv(
  file = "V:/Lab/Genotyping/SNP Projects/Chum/Project CM069 AHRP Parentage/Genotype Data Files/set2rr_CM069_outputs/probes.txt", 
  show_col_types = FALSE) %>%
  dplyr::distinct(`#Locus_ID`) %>%
  dplyr::pull()

all.equal(probe_loci_CM069, LocusControl$locusnames)

Yes, all are the same.

loci251 <- LocusControl$locusnames
GCLr::save_objects(objects = "loci251", path = "objects")

Genotypes

Read in genotypes by projec to get sillyvec.

rm(LocusControl)
# GCLr::loki2r_proj(project_name = paste0("CM0", 64:70), username = .username, password = .password)  # comment out to avoid re-running

What collections?

(sillyvec <- sort(project_sillys))

Subset sillyvec to remove alevin collection

(sillyvec <- grep(pattern = "a", x = sillyvec, ignore.case = FALSE, value = TRUE, invert = TRUE))

Wipe out .gcl objects and re-read in using loki2r.

GCLr::save_objects(objects = "sillyvec", path = "objects")

rm(list = objects(pattern = ".gcl"))
rm(LocusControl)

Read in genotypes.

GCLr::load_objects(path = "objects", pattern = "LocusControl")

GCLr::loki2r(sillyvec = sillyvec,
             username = .username,
             password = .password, 
             test_type = "GTSNP")

Only took 5.39 minutes to read everything in at the office!

Save raw genotypes (useful for working offsite and off VPN).

fs::dir_create("data/genotypes")
fs::dir_create("data/genotypes/raw")
# GCLr::save_sillys(sillyvec = sillyvec, path = "data/genotypes/raw", rds = TRUE)  # hide so you don't overwrite on accident!

Re-load

Re-load, if necessary

GCLr::load_objects(path = "objects")
[1] "loci251"      "LocusControl" "sillyvec"    
GCLr::load_sillys(sillyvec = sillyvec, path = "data/genotypes/raw", rds = TRUE)
 [1] "CMADMCR13.gcl"   "CMADMCR14.gcl"   "CMADMCR17.gcl"   "CMADMCR18.gcl"   "CMFISHCR13.gcl"  "CMFISHCR14.gcl"  "CMFISHCR17.gcl"  "CMFISHCR18.gcl" 
 [9] "CMFISHCR19.gcl"  "CMFISHCR20.gcl"  "CMFISHCR21.gcl"  "CMFISHCR22.gcl"  "CMFISHCR23.gcl"  "CMFISHCRT13.gcl" "CMFISHCRT21.gcl" "CMFISHCRT22.gcl"
[17] "CMFISHCRT23.gcl" "CMPROSCR13.gcl"  "CMPROSCR14.gcl"  "CMPROSCR17.gcl"  "CMPROSCR18.gcl"  "CMPROSCR19.gcl"  "CMPROSCR20.gcl"  "CMPROSCR21.gcl" 
[25] "CMPROSCR22.gcl"  "CMPROSCRT21.gcl" "CMPROSCRT22.gcl" "CMSAWCR13.gcl"   "CMSAWCR14.gcl"   "CMSAWCR15.gcl"   "CMSAWCR17.gcl"   "CMSAWCR18.gcl"  
[33] "CMSAWCR19.gcl"   "CMSAWCR20.gcl"   "CMSAWCR21.gcl"   "CMSAWCR22.gcl"   "CMSAWCRT21.gcl"  "CMSAWCRT22.gcl" 

Metadata

Read in paired metadata from Salmon Biological Fact and Stream Specimens, format similar to pws_pink_ped_dat. NOTE I’m doing this in a bit of a hurry, so make sure to proof read carefully if recycling this code!

(metadata <- readr::read_csv(file = "../OceanAK/AHRP Salmon Biological Data 20240105_122404.368459_with_spawning_state.csv"))
Rows: 27046 Columns: 25
── Column specification ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr  (10): silly_code, spawning_state, sex, tissue_type, dna_tray_code, location_code, sample_id, otolith_mark_present, otolith_mark_id, otolith_...
dbl   (9): collection_id, fish_id, length_mm, dna_tray_well_code, sample_year, is_missing_paired_data_exists, well_has_more_than_one_sample, fw_a...
lgl   (5): target_dna_tray_code, target_dna_tray_well_pos, target_container_array_type_id, container_array_type, determination_collection_id
date  (1): sample_date

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Modify

(
  metadata <- metadata %>% 
    dplyr::select(-dplyr::contains("target")) %>% 
    dplyr::select(-dplyr::contains("determination")) %>% 
    dplyr::rename(date = sample_date,
                  length = length_mm,
                  silly = silly_code) %>% 
    tidyr::unite(col = "sample", dna_tray_code:dna_tray_well_code, sep = "_", remove = FALSE) %>% 
    tidyr::unite(col = "silly_source", c(silly, fish_id), sep = "_", remove = FALSE) %>% 
    dplyr::mutate(franz_id = stringr::str_c(stringr::str_sub(silly, 1, 2),  # 1st letter of species name, 1st letter of stream name
                                            stringr::str_sub(silly, -2, -1),  # 2 digit sample year
                                            "_",
                                            stringr::str_pad(fish_id, width = 5, pad = "0", side = "left")),  # 5 digit fishid using padded zeros up front)
                  year = lubridate::year(date),
                  stream = stringr::str_remove(string = location_code, pattern = " Creek"),
                  DOY = lubridate::yday(date),  # day of year
                  sex = dplyr::case_when(
                    sex == "M" ~ "Male",
                    sex == "F" ~ "Female",
                    sex == "U" ~ NA_character_,
                    is.na(sex) ~ NA_character_),
                  origin = dplyr::case_when(
                    otolith_mark_present == "NO" ~ "Natural",
                    otolith_mark_present == "YES" ~ "Hatchery"
                  ),  # add origin variable
                  origin = base::factor(origin, levels = c("Natural", "Hatchery")),  # make factor to ensure hatchery != red
                  spawning_state = dplyr::case_when(
                    spawning_state == "Unknown" ~ NA_character_,
                    TRUE ~ spawning_state
                  )  # force Unknown to NA
    ) %>%  
    dplyr::select(
      franz_id,  # unique identifier for FRANz (e.g. PE16_00001, species, stream, year, GCL fish_id)
      sample,  # ultimate data key for AHRP, DWP barcode + DWP position
      silly_source,  # unique identifier for LOKI (GCL database)
      stream,  # 5 pedigree streams
      year,  # sample year (aka death year for FRANz)
      origin,  # hatchery or wild
      # hatchery,  # which hatchery
      sex,  # male, female, unknown
      date,  # sample date
      DOY,  # day of year (aka julian sample date)
      length,  # mideye to hypural plate length in mm
      # intertidal,  # sampling location above or below intertidal
      # distance_mouth,  # riverdist approximate sampling location from mouth in meters 
      spawning_state,  # alive, pink gill, grey gill, rotting (aka index of stream life)
      # pre_spawn,  # TRUE/FALSE (added in 2016); if the eggs and milt do not easily release and carcass has completely full gonads
      # partial_spawn,  # TRUE/FALSE (added in 2015); fish is sampled (dead or alive) with more than a little eggs or milt
      # preyed_upon,  # TRUE/FALSE (added in 2015); gonads removed by predators
      # stream_trib,  # stream tributary (Gilmour, Paddy, and Stockdale have multiple)
      # latitude,  # sampling area (where fish were moved to prior to sampling, rough approximate of spawning location)
      # longitude,  # sampling area (where fish were moved to prior to sampling, rough approximate of spawning location)
      otolith_mark_present,  # otolith mark present (yes = hatchery, no = natural)
      otolith_mark_id,  # otolith hatchery mark
      otolith_mark_status_code,  # otolith read status code (reason for no read = overground, lost, etc.)
      silly,  # GCL collection code
      fish_id,  # GCL fish ID within a silly
      dna_tray_code,  # 10-digit DWP barcode
      dna_tray_well_code,  # 48 DWP position 1:48
      # dna_tray_well_pos,  # 48 DWP position A1:H6
      # distance_tide,  # riverdist approximate sampling location from maximum high tide
      # riverdist_seg,  # riverdist river segment
      # riverdist_vert,  # riverdist vertex
      # riverdist_snapdist,  # riverdist snapping distance (aka how far was sampling location lat/long from the stream polyline)
      # high_tide  # highest extent of high tide in meters from mouth, unique to each stream
      fw_age,
      sw_age
    )  
)       

Sample Sizes

Verify that all genotyped samples have metadata.

geno_silly_source <- sapply(sillyvec, function(silly) {get(paste0(silly, ".gcl"))$SillySource}) %>% unlist()

table(geno_silly_source %in% metadata$silly_source)

FALSE  TRUE 
  241 18569 

Shoot, which samples do not have any metadata?

Ah, I bet they are the Fish Creek tag samples from 2013 that Tyler Dann and Heather Liller collected!

setdiff(geno_silly_source, metadata$silly_source)
  [1] "CMFISHCRT13_1"   "CMFISHCRT13_2"   "CMFISHCRT13_3"   "CMFISHCRT13_4"   "CMFISHCRT13_5"   "CMFISHCRT13_6"   "CMFISHCRT13_7"   "CMFISHCRT13_8"  
  [9] "CMFISHCRT13_9"   "CMFISHCRT13_10"  "CMFISHCRT13_11"  "CMFISHCRT13_12"  "CMFISHCRT13_13"  "CMFISHCRT13_14"  "CMFISHCRT13_15"  "CMFISHCRT13_16" 
 [17] "CMFISHCRT13_17"  "CMFISHCRT13_18"  "CMFISHCRT13_19"  "CMFISHCRT13_20"  "CMFISHCRT13_21"  "CMFISHCRT13_22"  "CMFISHCRT13_23"  "CMFISHCRT13_24" 
 [25] "CMFISHCRT13_25"  "CMFISHCRT13_26"  "CMFISHCRT13_27"  "CMFISHCRT13_28"  "CMFISHCRT13_29"  "CMFISHCRT13_30"  "CMFISHCRT13_31"  "CMFISHCRT13_32" 
 [33] "CMFISHCRT13_33"  "CMFISHCRT13_34"  "CMFISHCRT13_35"  "CMFISHCRT13_36"  "CMFISHCRT13_37"  "CMFISHCRT13_38"  "CMFISHCRT13_39"  "CMFISHCRT13_40" 
 [41] "CMFISHCRT13_41"  "CMFISHCRT13_42"  "CMFISHCRT13_43"  "CMFISHCRT13_44"  "CMFISHCRT13_45"  "CMFISHCRT13_46"  "CMFISHCRT13_47"  "CMFISHCRT13_48" 
 [49] "CMFISHCRT13_49"  "CMFISHCRT13_50"  "CMFISHCRT13_51"  "CMFISHCRT13_52"  "CMFISHCRT13_53"  "CMFISHCRT13_54"  "CMFISHCRT13_55"  "CMFISHCRT13_56" 
 [57] "CMFISHCRT13_57"  "CMFISHCRT13_58"  "CMFISHCRT13_59"  "CMFISHCRT13_60"  "CMFISHCRT13_61"  "CMFISHCRT13_62"  "CMFISHCRT13_63"  "CMFISHCRT13_64" 
 [65] "CMFISHCRT13_65"  "CMFISHCRT13_66"  "CMFISHCRT13_67"  "CMFISHCRT13_68"  "CMFISHCRT13_69"  "CMFISHCRT13_70"  "CMFISHCRT13_71"  "CMFISHCRT13_72" 
 [73] "CMFISHCRT13_73"  "CMFISHCRT13_74"  "CMFISHCRT13_75"  "CMFISHCRT13_76"  "CMFISHCRT13_77"  "CMFISHCRT13_78"  "CMFISHCRT13_79"  "CMFISHCRT13_80" 
 [81] "CMFISHCRT13_81"  "CMFISHCRT13_82"  "CMFISHCRT13_83"  "CMFISHCRT13_84"  "CMFISHCRT13_85"  "CMFISHCRT13_86"  "CMFISHCRT13_87"  "CMFISHCRT13_88" 
 [89] "CMFISHCRT13_89"  "CMFISHCRT13_90"  "CMFISHCRT13_91"  "CMFISHCRT13_92"  "CMFISHCRT13_93"  "CMFISHCRT13_94"  "CMFISHCRT13_95"  "CMFISHCRT13_96" 
 [97] "CMFISHCRT13_97"  "CMFISHCRT13_98"  "CMFISHCRT13_99"  "CMFISHCRT13_100" "CMFISHCRT13_101" "CMFISHCRT13_102" "CMFISHCRT13_103" "CMFISHCRT13_104"
[105] "CMFISHCRT13_105" "CMFISHCRT13_106" "CMFISHCRT13_107" "CMFISHCRT13_108" "CMFISHCRT13_109" "CMFISHCRT13_110" "CMFISHCRT13_111" "CMFISHCRT13_112"
[113] "CMFISHCRT13_113" "CMFISHCRT13_114" "CMFISHCRT13_115" "CMFISHCRT13_116" "CMFISHCRT13_117" "CMFISHCRT13_118" "CMFISHCRT13_119" "CMFISHCRT13_120"
[121] "CMFISHCRT13_121" "CMFISHCRT13_122" "CMFISHCRT13_123" "CMFISHCRT13_124" "CMFISHCRT13_125" "CMFISHCRT13_126" "CMFISHCRT13_127" "CMFISHCRT13_128"
[129] "CMFISHCRT13_129" "CMFISHCRT13_130" "CMFISHCRT13_131" "CMFISHCRT13_132" "CMFISHCRT13_133" "CMFISHCRT13_134" "CMFISHCRT13_135" "CMFISHCRT13_136"
[137] "CMFISHCRT13_137" "CMFISHCRT13_138" "CMFISHCRT13_139" "CMFISHCRT13_140" "CMFISHCRT13_141" "CMFISHCRT13_142" "CMFISHCRT13_143" "CMFISHCRT13_144"
[145] "CMFISHCRT13_145" "CMFISHCRT13_146" "CMFISHCRT13_147" "CMFISHCRT13_148" "CMFISHCRT13_149" "CMFISHCRT13_150" "CMFISHCRT13_151" "CMFISHCRT13_152"
[153] "CMFISHCRT13_153" "CMFISHCRT13_154" "CMFISHCRT13_155" "CMFISHCRT13_156" "CMFISHCRT13_157" "CMFISHCRT13_158" "CMFISHCRT13_159" "CMFISHCRT13_160"
[161] "CMFISHCRT13_161" "CMFISHCRT13_162" "CMFISHCRT13_163" "CMFISHCRT13_164" "CMFISHCRT13_165" "CMFISHCRT13_166" "CMFISHCRT13_167" "CMFISHCRT13_168"
[169] "CMFISHCRT13_169" "CMFISHCRT13_170" "CMFISHCRT13_171" "CMFISHCRT13_172" "CMFISHCRT13_173" "CMFISHCRT13_174" "CMFISHCRT13_175" "CMFISHCRT13_176"
[177] "CMFISHCRT13_177" "CMFISHCRT13_178" "CMFISHCRT13_179" "CMFISHCRT13_180" "CMFISHCRT13_181" "CMFISHCRT13_182" "CMFISHCRT13_183" "CMFISHCRT13_184"
[185] "CMFISHCRT13_185" "CMFISHCRT13_186" "CMFISHCRT13_187" "CMFISHCRT13_188" "CMFISHCRT13_189" "CMFISHCRT13_190" "CMFISHCRT13_191" "CMFISHCRT13_192"
[193] "CMFISHCRT13_193" "CMFISHCRT13_194" "CMFISHCRT13_195" "CMFISHCRT13_196" "CMFISHCRT13_197" "CMFISHCRT13_198" "CMFISHCRT13_199" "CMFISHCRT13_200"
[201] "CMFISHCRT13_201" "CMFISHCRT13_202" "CMFISHCRT13_203" "CMFISHCRT13_204" "CMFISHCRT13_205" "CMFISHCRT13_206" "CMFISHCRT13_207" "CMFISHCRT13_208"
[209] "CMFISHCRT13_209" "CMFISHCRT13_210" "CMFISHCRT13_211" "CMFISHCRT13_212" "CMFISHCRT13_213" "CMFISHCRT13_214" "CMFISHCRT13_215" "CMFISHCRT13_216"
[217] "CMFISHCRT13_217" "CMFISHCRT13_218" "CMFISHCRT13_219" "CMFISHCRT13_220" "CMFISHCRT13_221" "CMFISHCRT13_222" "CMFISHCRT13_223" "CMFISHCRT13_224"
[225] "CMFISHCRT13_225" "CMFISHCRT13_226" "CMFISHCRT13_227" "CMFISHCRT13_228" "CMFISHCRT13_229" "CMFISHCRT13_230" "CMFISHCRT13_231" "CMFISHCRT13_232"
[233] "CMFISHCRT13_233" "CMFISHCRT13_234" "CMFISHCRT13_235" "CMFISHCRT13_236" "CMFISHCRT13_237" "CMFISHCRT13_238" "CMFISHCRT13_239" "CMFISHCRT13_240"
[241] "CMFISHCRT13_241"

Correct. Add these to metadata.

metadata <- 
  dplyr::bind_rows(metadata,
                   CMFISHCRT13.gcl %>% 
                     dplyr::select(SILLY_CODE, FK_FISH_ID, SillySource, DNA_TRAY_CODE, DNA_TRAY_WELL_CODE, CAPTURE_DATE) %>% 
                     dplyr::rename(silly = SILLY_CODE,
                                   fish_id = FK_FISH_ID,
                                   silly_source = SillySource, 
                                   dna_tray_code = DNA_TRAY_CODE,
                                   dna_tray_well_code = DNA_TRAY_WELL_CODE,
                                   date = CAPTURE_DATE) %>% 
                     tidyr::unite(col = "sample", dna_tray_code:dna_tray_well_code, sep = "_", remove = FALSE) %>% 
                     tidyr::unite(col = "silly_source", c(silly, fish_id), sep = "_", remove = FALSE) %>% 
                     dplyr::mutate(franz_id = stringr::str_c(stringr::str_sub(silly, 1, 2),  # 1st letter of species name, 1st letter of stream name
                                                             stringr::str_sub(silly, -2, -1),  # 2 digit sample year
                                                             "_",
                                                             stringr::str_pad(fish_id, width = 5, pad = "0", side = "left")),  # 5 digit fishid using padded zeros up front)
                                   date = lubridate::as_date(date),
                                   year = lubridate::year(date),
                                   stream = "Fish - Douglas Island",
                                   DOY = lubridate::yday(date)  # day of year
                     ) 
)

Verify all samples have metadata.

table(geno_silly_source %in% metadata$silly_source)

 TRUE 
18810 

Excellent, all present and accounted for.

Record which samples were genotyped.

metadata <- metadata %>% 
  dplyr::mutate(genotyped = silly_source %in% geno_silly_source)

How many samples per collection (silly)?

metadata %>% 
  dplyr::count(genotyped, silly) %>% 
  tidyr::pivot_wider(names_from = genotyped, values_from = n)

Check For Sample Duplicates

I got a weird many-to-many error in a join the first time I ran through this script, so I want to check and see if there are actually duplicate samples present and if so, remove them!

# bind all samples together into a single .gcl object
my.gcl <- sapply(sillyvec, function(silly) {
  my.silly <- get(paste0(silly, ".gcl"))
}, simplify = FALSE) %>% 
  dplyr::bind_rows()

Any duplicate SillySource?

table(table(my.gcl$SillySource))

    1 
18810 

Clean up

Are there metadata duplicates?

table(table(metadata$silly_source))

    1     2 
27197    45 

Well son of a biscuit, there are!

metadata %>% 
  dplyr::filter(duplicated(silly_source) | duplicated(silly_source, fromLast = TRUE)) %>% 
  dplyr::arrange(silly_source)

Okay, they appear to be true duplicates, no conflicting columns. Remove duplicate rows.

metadata <- metadata %>% 
  dplyr::distinct(silly_source, .keep_all = TRUE)

Join Metadata

Join metadata to each silly by SillySource.

x <- sapply(sillyvec, function(silly) {
  my.silly <- get(paste0(silly, ".gcl"))
  my.silly <- my.silly %>% dplyr::left_join(
    y = metadata %>% dplyr::select(-genotyped),
    by = dplyr::join_by(SillySource == silly_source)
  )
  assign(x = paste0(silly, ".gcl"),
         value = my.silly,
         pos = 1)
})
rm(x)  # suppresses output

Filter QC Crosscheck Plates

Create a vector of plate IDs to toss, pending QC crosschecks.

plate_ids_qcxcheck <- c(64461, 64462, 64480, 64482, 64483, 64484, 64485, 64486, 64502, 64503, 64509, 64511, 64512, 64513, 64514, 64515, 64542, 64543, 64582, 56362, 56363) %>% as.character()

Get sample size by silly.

(
  sample_size_qcxcheck <- dplyr::tibble(silly = sillyvec) %>%
    dplyr::mutate(genotyped = GCLr::silly_n(sillyvec = sillyvec) %>% dplyr::pull(n))
)

Remove any fish on the plates pending QC crosschecks.

x <- sapply(sillyvec, function(silly) {
  my.silly <- get(paste0(silly, ".gcl"))
  my.silly <- my.silly %>% dplyr::filter(!stringr::str_detect(string = PLATE_ID, pattern = stringr::str_c(plate_ids_qcxcheck, collapse = "|")))
  assign(x = paste0(silly, ".gcl"),
         value = my.silly,
         pos = 1)
})
rm(x)  # suppresses output

How many fish were removed by silly?

(
  sample_size_qcxcheck <- sample_size_qcxcheck %>%
    dplyr::mutate(qcxcheck = genotyped - GCLr::silly_n(sillyvec = sillyvec) %>% dplyr::pull(n))
)

How many total samples?

sum(sample_size_qcxcheck$qcxcheck)
[1] 1995

Okay, that looks about right for 21 plates. Must have been some partial plates.

Genotype QA

Standard data QA (GT-seq, no conScore):

  • Remove fish missing genotypes (<80% loci)
  • Remove contamination (heterozygosity outliers +/- 3.5 modified z-score)
  • Remove duplicates (>95% loci concordance, remove both duplicates)
(
  sample_size_qa <- dplyr::tibble(silly = sillyvec) %>%
    dplyr::mutate(genotyped = GCLr::silly_n(sillyvec = sillyvec) %>% dplyr::pull(n))
)

Missing Loci (<80%)

Remove fish with <80% loci genotyped.

miss_loci <-
  GCLr::remove_ind_miss_loci(sillyvec = sillyvec,
                             proportion = 0.8)
G3;30 IDs were removed from CMADMCR13.gcl
gG3;30 IDs were removed from CMADMCR17.gcl
gG3;4 IDs were removed from CMADMCR18.gcl
gG3;6 IDs were removed from CMFISHCR13.gcl
gG3;110 IDs were removed from CMFISHCR14.gcl
gG3;111 IDs were removed from CMFISHCR17.gcl
gG3;79 IDs were removed from CMFISHCR18.gcl
gG3;182 IDs were removed from CMFISHCR19.gcl
gG3;6 IDs were removed from CMFISHCR21.gcl
gG3;29 IDs were removed from CMFISHCR22.gcl
gG3;36 IDs were removed from CMFISHCR23.gcl
gG3;2 IDs were removed from CMFISHCRT13.gcl
gG3;2 IDs were removed from CMFISHCRT21.gcl
gG3;3 IDs were removed from CMFISHCRT22.gcl
gG3;1 IDs were removed from CMFISHCRT23.gcl
gG3;42 IDs were removed from CMPROSCR13.gcl
gG3;5 IDs were removed from CMPROSCR14.gcl
gG3;17 IDs were removed from CMPROSCR17.gcl
gG3;8 IDs were removed from CMPROSCR18.gcl
gG3;13 IDs were removed from CMPROSCR19.gcl
gG3;2 IDs were removed from CMPROSCR21.gcl
gG3;9 IDs were removed from CMPROSCR22.gcl
gG3;2 IDs were removed from CMPROSCRT22.gcl
gG3;23 IDs were removed from CMSAWCR13.gcl
gG3;1 IDs were removed from CMSAWCR14.gcl
gG3;6 IDs were removed from CMSAWCR15.gcl
gG3;46 IDs were removed from CMSAWCR17.gcl
gG3;17 IDs were removed from CMSAWCR18.gcl
gG3;17 IDs were removed from CMSAWCR19.gcl
gG3;8 IDs were removed from CMSAWCR21.gcl
gG3;25 IDs were removed from CMSAWCR22.gcl
gG3;A total of 872 individuals were removed from sillys in sillyvec.
g
# miss_loci$IDs_Removed

GCLr::save_objects(objects = "miss_loci", path =  "../objects", rds = TRUE)

(
  sample_size_qa <- sample_size_qa %>%
    dplyr::mutate(missing = genotyped - GCLr::silly_n(sillyvec = sillyvec) %>% dplyr::pull(n))
)

Overall 872 samples dropped due to poor quality DNA.

What was genotyping success by spawning state?

miss_loci_tib <- tibble::enframe(miss_loci$IDs_Removed, name = "silly", value = "fish_id") %>% 
  tidyr::unnest_longer(fish_id) %>% 
  dplyr::left_join(y = metadata, by = dplyr::join_by(silly, fish_id))

metadata %>% 
  dplyr::filter(genotyped) %>% 
  dplyr::mutate(drop_miss = silly_source %in% miss_loci_tib$silly_source) %>% 
  dplyr::count(drop_miss, spawning_state) %>% 
  tidyr::pivot_wider(names_from = drop_miss, values_from = n) %>% 
  dplyr::rename(n_drop = `TRUE`, n_keep = `FALSE`) %>% 
  dplyr::mutate(n = n_keep + n_drop,
                p_keep = n_keep / n,
                spawning_state = factor(spawning_state, levels = c("Alive", "Pink Gill", "Grey Gill", "Rotting"))) %>% 
  dplyr::arrange(spawning_state)

As expected, genotyping success was much lower for rotting fish.

Contamination (Heterozygosity > +/- 3.5 modified Z-score)

Remove contaminated samples prior to checking for duplicates!

Individual heterozygosity outliers is our best metric for removing contaminated samples in the absence of something more sophisticated like conScore from GTscore. In previous analyses, including Shedd et al. 2022, we’ve used the standard 1.5 IQR outlier detection method to remove excessively heterozygous individuals. However, the 1.5 IQR method assumes a normal distribution, but heart sample heterozygosities tend to be skewed right due to contamination. As of 2024-02-08, we decided to pivot to using modified Z-scores with cutoffs of +/- 3.5 as recommended by Iglewicz and Hoaglin 1. The modified Z-score uses the median and median absolute deviation (MAD) instead of the mean and standard deviation, and is thus more robust to outliers and asymmetrical distributions.

Source function and calculate individual heterozygosities from AHRP-PWS-Pink-Salmon-Pedigrees GitHub repo.

source("~/GitHub_repos/AHRP-PWS-Pink-Salmon-Pedigrees/code/functions/calc_ind_het.R")

(
  ind_het <-
    calc_ind_het(
      sillyvec = sillyvec,
      loci = loci251,
      ncores = parallel::detectCores() - 4
    )
)
Time difference of 17.24236 secs

Calculate +/- 3.5 modified z-score cutoffs by stream. NOTE this means that the exact heterozygosity cutoff will vary slightly by stream. For PWS Pink Salmon, I did it separately by silly, but some of our sample sizes are pretty small here and we have the separate “tag” sillys. I’m opting to do this by stream to allow for subtle changes in allele frequency per stream.

# Function to calculate median absolute deviation (MAD)
mad <- function(x) {
  median(abs(x - median(x)))
}

# Calculate the modified z-score for each individual
(
  ind_het <- ind_het %>%
    dplyr::mutate(
      year = 2000 + as.numeric(stringr::str_sub(
        string = silly,
        start = -2,
        end = -1
      )),
      stream = dplyr::case_when(
        stringr::str_detect(string = silly, pattern = "ADM") ~ "Admiralty",
        stringr::str_detect(string = silly, pattern = "FISH") ~ "Fish",
        stringr::str_detect(string = silly, pattern = "PROS") ~ "Prospect",
        stringr::str_detect(string = silly, pattern = "SAW") ~ "Sawmill",
        TRUE ~ "mistakes_were_made"
      )
    ) %>%
    dplyr::group_by(stream) %>%  # cutoffs are specific to each stream!
    dplyr::mutate(
      modified_z_score = 0.6745 * (het - median(het)) / mad(het),
      outlier = dplyr::case_when(abs(modified_z_score) > 3.5 ~ TRUE, TRUE ~ FALSE)
    ) %>%
    dplyr::ungroup()
)
  
GCLr::save_objects("ind_het", path = "objects", rds = TRUE)

Plot distribution of heterozygosity and show outliers.

ind_het %>% 
  ggplot2::ggplot(ggplot2::aes(x = het, fill = outlier)) +
  ggplot2::geom_histogram(binwidth = 1 / length(loci251)) +
  ggplot2::facet_grid(rows = ggplot2::vars(stream), scales = "free_y") +
  ggplot2::xlim(0, 1) +
  ggplot2::scale_fill_manual(name = "Outlier", values = c("TRUE" = "black", "FALSE" = "grey60")) +
  ggplot2::xlab("Individual Heterozygosity") +
  ggplot2::ylab("Frequency") +
  ggplot2::ggtitle("Individual Heterozygosity - By Stream") +
  ggplot2::theme_bw(base_size = 14) 

Remove outliers.

output <- lapply(sillyvec, function(x) {
  GCLr::remove_ids(
    silly = x,
    IDs = ind_het %>% dplyr::filter(outlier, silly == x) %>% dplyr::pull(fish_id)
  )
})
G3;8 IDs were removed from CMADMCR13.gcl
gG3;5 IDs were removed from CMADMCR14.gcl
gG3;147 IDs were removed from CMADMCR17.gcl
gG3;2 IDs were removed from CMADMCR18.gcl
gG3;11 IDs were removed from CMFISHCR13.gcl
gG3;113 IDs were removed from CMFISHCR14.gcl
gG3;111 IDs were removed from CMFISHCR17.gcl
gG3;49 IDs were removed from CMFISHCR18.gcl
gG3;55 IDs were removed from CMFISHCR19.gcl
gG3;2 IDs were removed from CMFISHCR20.gcl
gG3;7 IDs were removed from CMFISHCR21.gcl
gG3;17 IDs were removed from CMFISHCR22.gcl
gG3;100 IDs were removed from CMFISHCR23.gcl
gG3;1 IDs were removed from CMFISHCRT13.gcl
gG3;0 IDs were removed from CMFISHCRT21.gcl
gG3;1 IDs were removed from CMFISHCRT22.gcl
gG3;3 IDs were removed from CMFISHCRT23.gcl
gG3;18 IDs were removed from CMPROSCR13.gcl
gG3;10 IDs were removed from CMPROSCR14.gcl
gG3;153 IDs were removed from CMPROSCR17.gcl
gG3;6 IDs were removed from CMPROSCR18.gcl
gG3;50 IDs were removed from CMPROSCR19.gcl
gG3;0 IDs were removed from CMPROSCR20.gcl
gG3;2 IDs were removed from CMPROSCR21.gcl
gG3;10 IDs were removed from CMPROSCR22.gcl
gG3;0 IDs were removed from CMPROSCRT21.gcl
gG3;0 IDs were removed from CMPROSCRT22.gcl
gG3;17 IDs were removed from CMSAWCR13.gcl
gG3;2 IDs were removed from CMSAWCR14.gcl
gG3;4 IDs were removed from CMSAWCR15.gcl
gG3;93 IDs were removed from CMSAWCR17.gcl
gG3;1 IDs were removed from CMSAWCR18.gcl
gG3;0 IDs were removed from CMSAWCR19.gcl
gG3;0 IDs were removed from CMSAWCR20.gcl
gG3;2 IDs were removed from CMSAWCR21.gcl
gG3;0 IDs were removed from CMSAWCR22.gcl
gG3;0 IDs were removed from CMSAWCRT21.gcl
gG3;0 IDs were removed from CMSAWCRT22.gcl
g
message(paste0("A total of ", ind_het %>% dplyr::filter(outlier) %>% nrow(), " individuals were removed from sillys in sillyvec."))
G3;A total of 1000 individuals were removed from sillys in sillyvec.
g

Not too many contaminated samples! Most likely culled during missing loci.

(
  sample_size_qa <- sample_size_qa %>%
    dplyr::mutate(heterozygosity = genotyped - missing - GCLr::silly_n(sillyvec = sillyvec) %>% dplyr::pull(n))
)

Overall 1,000 samples dropped due to sample contamination.

Duplicate (>=95%)

Identify potential duplicate genotypes (>=95% loci).

duplicate_check_95 <-
  GCLr::dupcheck_within_silly(
    sillyvec = sillyvec,
    minproportion = 0.95,
    minnonmissing = 0.6,
    ncores = parallel::detectCores() - 4
  )
Time difference of 21.62875 secs
GCLr::save_objects("duplicate_check_95", path = "objects", rds = TRUE)

How many duplicate pairs by silly?

duplicate_check_95 %>% 
  dplyr::count(silly) %>% 
  dplyr::arrange(dplyr::desc(n))

Remove both duplicates! As opposed to GSI work, where we want to keep individuals but aren’t typically worried about paired data, here we want to remove both individuals as the paired data integrity (including otolith reads!) is lost.

duplicates_removed <-
  GCLr::remove_dups(dupcheck = duplicate_check_95, remove_both = TRUE)
G3;2 IDs were removed from CMADMCR14.gcl
gG3;20 IDs were removed from CMADMCR17.gcl
gG3;6 IDs were removed from CMADMCR18.gcl
gG3;2 IDs were removed from CMFISHCR13.gcl
gG3;12 IDs were removed from CMFISHCR14.gcl
gG3;14 IDs were removed from CMFISHCR17.gcl
gG3;38 IDs were removed from CMFISHCR18.gcl
gG3;18 IDs were removed from CMFISHCR19.gcl
gG3;4 IDs were removed from CMFISHCR23.gcl
gG3;2 IDs were removed from CMFISHCRT23.gcl
gG3;2 IDs were removed from CMPROSCR13.gcl
gG3;4 IDs were removed from CMPROSCR14.gcl
gG3;14 IDs were removed from CMPROSCR17.gcl
gG3;4 IDs were removed from CMPROSCR18.gcl
gG3;10 IDs were removed from CMPROSCR19.gcl
gG3;4 IDs were removed from CMPROSCR20.gcl
gG3;2 IDs were removed from CMPROSCR22.gcl
gG3;8 IDs were removed from CMSAWCR17.gcl
gG3;4 IDs were removed from CMSAWCR18.gcl
gG3;2 IDs were removed from CMSAWCR22.gcl
g
GCLr::save_objects("duplicates_removed", path = "objects", rds = TRUE)

(
  sample_size_qa <- sample_size_qa %>%
    dplyr::mutate(
      duplicate = genotyped - missing - heterozygosity - GCLr::silly_n(sillyvec = sillyvec) %>% dplyr::pull(n)
    )
)

Overall 157 samples dropped due to duplicate genotypes.

Final

Final, post-QA sample sizes by silly.

(
  sample_size_qa <- sample_size_qa %>%
    dplyr::mutate(
      final =  GCLr::silly_n(sillyvec = sillyvec) %>% dplyr::pull(n)
    )
)

GCLr::save_objects("sample_size_qa", path = "objects", rds = TRUE)

fs::dir_create("output")
readr::write_csv(x = sample_size_qa, file = "output/sample_size_qa.csv")

Save

Save final genotypes and update metadata.

Genotypes

Save post-QA genotypes with joined metadata.

fs::dir_create("data/genotypes/postQA")
GCLr::save_sillys(sillyvec = sillyvec, path = "data/genotypes/postQA", rds = TRUE)

Metadata

add pass_QA

postQA_silly_source <- sapply(sillyvec, function(silly) {get(paste0(silly, ".gcl"))$SillySource}) %>% unlist()

metadata <- metadata %>% 
  dplyr::mutate(pass_QA = silly_source %in% postQA_silly_source)

Confirm.

metadata %>% 
  dplyr::count(genotyped, pass_QA)

NOTE that genotyped in sample_size_qa doesn’t match genotyped in metadata because I pre-filtered out QC Crosscheck plates.

sample_size_qa %>% 
  janitor::adorn_totals() %>% 
  dplyr::filter(silly == "Total")
 silly genotyped missing heterozygosity duplicate final
 Total     16815     872           1000       157 14786

QA Conclusions

Overall 18,810 total pedigree samples were genotyped; 1,995 were removed due to pending QC Crosschecks; 872 were removed due to poor quality DNA (missing >= 20% loci); 1,000 were removed due to contaminated DNA (heterozygosity >= 3.5 modified z-score); and 157 were removed due to duplicate genotypes (>= 95% loci), leaving a total of 14,786 in the final dataset.

Adaptive Loci Genotypes

Create a single object with just the metadata and adaptive loci genotypes.

Calculate Allele Frequencies

Calculate the allele counts for each locus by stream, year, and DOY. Adapted from Pat Barry’s code in AHRP_ChumAdaptive.html

loci_adaptive <- grep(pattern = "OkeV2", x = loci251, value = TRUE)

# testing
# chum_adaptive_geno %>%
#   dplyr::select(silly_source, stream, year, DOY, tidyselect::starts_with(loci_adaptive[1])) %>%
#   tidyr::pivot_longer(cols = tidyselect::starts_with(loci_adaptive[1]), names_to = "locus", values_to = "allele") %>%
#   dplyr::mutate(allele = factor(allele),
#                 allele = as.numeric(allele)) %>%  # make factor and convert to numeric
#   dplyr::count(stream, year, DOY, allele) %>%
#   dplyr::mutate(locus = loci_adaptive[1])

# calculate the number of each allele observed per stream, year, day
(chum_adaptive_allele_counts <- lapply(loci_adaptive, function(locus) {
  chum_adaptive_geno %>%
    dplyr::select(silly_source,
                  stream,
                  year,
                  DOY,
                  tidyselect::starts_with(locus)) %>%
    tidyr::pivot_longer(
      cols = tidyselect::starts_with(locus),
      names_to = "locus",
      values_to = "allele"
    ) %>%
    dplyr::mutate(allele = factor(allele),
                allele = as.numeric(allele)) %>%  # make factor and convert to numeric
    dplyr::count(stream, year, DOY, allele) %>%
    dplyr::mutate(locus = locus)
}) %>%
    dplyr::bind_rows())

Convert to alelle frequencies.

(
  chum_adaptive_allele_freq <- chum_adaptive_allele_counts %>%
    tidyr::pivot_wider(names_from = allele, values_from = n) %>%
    dplyr::select(-`NA`) %>%  # drop no-calls
    dplyr::mutate(
      `1` = replace_na(`1`, 0),
      # account for unobserved alleles on a given day
      `2` = replace_na(`2`, 0)
    ) %>%
    tidyr::pivot_longer(
      cols = `1`:`2`,
      names_to = "allele",
      values_to = "n"
    ) %>%
    dplyr::group_by(stream, year, DOY, locus) %>%
    dplyr::mutate(
      n_alleles = sum(n),
      p_alleles = n / n_alleles,
      SNP = paste(locus, allele, sep = "_")
    ) %>% 
    dplyr::ungroup()
)

View Allele Frequencies

#  TODO
# double check allele frequency code to make sure it is correct
# make sure alleles are being assigned 1 vs. 2 consistently
# filter out hatchery strays?
# check sample sizes by stream/year/DOY?
# see if spawning state correlates?

Admiralty

chum_adaptive_allele_freq %>% 
  dplyr::filter(stream == "Admiralty") %>% 
  dplyr::mutate(locus = stringr::str_remove_all(string = locus, pattern = "OkeV2_")) %>% 
  ggplot2::ggplot(ggplot2::aes(x = DOY, y = p_alleles, colour = SNP, group_by(SNP))) +
  ggplot2::geom_point() +
  ggplot2::geom_smooth(method = "lm", weight = "n_alleles", formula = y~x) +
  ggplot2::facet_grid(locus ~ year) + 
  ggplot2::theme_bw() + 
  ggplot2::theme(legend.position = 'none')

Fish

chum_adaptive_allele_freq %>% 
  dplyr::filter(stream == "Fish - Douglas Island") %>% 
  dplyr::mutate(locus = stringr::str_remove_all(string = locus, pattern = "OkeV2_")) %>% 
  ggplot2::ggplot(ggplot2::aes(x = DOY, y = p_alleles, colour = SNP, group_by(SNP))) +
  ggplot2::geom_point() +
  ggplot2::geom_smooth(method = "lm", weight = "n_alleles", formula = y~x) +
  ggplot2::facet_grid(locus ~ year) + 
  ggplot2::theme_bw() + 
  ggplot2::theme(legend.position = 'none')

Prospect

chum_adaptive_allele_freq %>% 
  dplyr::filter(stream == "Prospect") %>% 
  dplyr::mutate(locus = stringr::str_remove_all(string = locus, pattern = "OkeV2_")) %>% 
  ggplot2::ggplot(ggplot2::aes(x = DOY, y = p_alleles, colour = SNP, group_by(SNP))) +
  ggplot2::geom_point() +
  ggplot2::geom_smooth(method = "lm", weight = "n_alleles", formula = y~x) +
  ggplot2::facet_grid(locus ~ year) + 
  ggplot2::theme_bw() + 
  ggplot2::theme(legend.position = 'none')

Sawmill

chum_adaptive_allele_freq %>% 
  dplyr::filter(stream == "Sawmill") %>% 
  dplyr::mutate(locus = stringr::str_remove_all(string = locus, pattern = "OkeV2_")) %>% 
  ggplot2::ggplot(ggplot2::aes(x = DOY, y = p_alleles, colour = SNP, group_by(SNP))) +
  ggplot2::geom_point() +
  ggplot2::geom_smooth(method = "lm", weight = "n_alleles", formula = y~x) +
  ggplot2::facet_grid(locus ~ year) + 
  ggplot2::theme_bw() + 
  ggplot2::theme(legend.position = 'none')

end


  1. Boris Iglewicz and David Hoaglin (1993), “Volume 16: How to Detect and Handle Outliers”, The ASQC Basic References in Quality Control: Statistical Techniques, Edward F. Mykytka, Ph.D., Editor.↩︎

---
title: "Export Genotypes for 9 Adaptive Run Timing Loci to NOAA"
subtitle: "Filtered for Plates Not Needing QC Crosschecks"
author: "Kyle Shedd"
date: "Started: 2025-07-24, last opened: `r Sys.Date()`"
output:
  html_notebook:
    theme: united
    toc: yes
    toc_float: true
editor_options: 
  chunk_output_type: inline
---

# Setup

```{r setup, message=FALSE, results='hide'}
rm(list = ls(all.names = TRUE))

if(!require("pacman")) install.packages("pacman"); library(pacman)

pacman::p_load(
  tidyverse,
  scales,
  janitor,
  GCLr,
  coin
)

knitr::opts_chunk$set(fig.width = 10)
knitr::opts_chunk$set(echo = TRUE)

.username = readLines("~/usr_pw.txt", n = 1)  # LOKI username
.password = readLines("~/usr_pw.txt" , n = 2)[[2]]  # LOKI password
```

# Objective

The purpose of this notebook is to provide Wes Larson (NOAA ABL) with adaptive run timing genotypes (9 loci) for AHRP chum samples. Filter out plates undergoing QC crosschecks due to poor genotyping success. Wes is interested in representation across years, less across multiple streams. He wants to see how the patterns hold up over time.

## Steps

  * import all existing genotypes for CM064-CM070
  * filter out untrustworthy plates that need QC cross checks
  * standard GT-seq genotype QA (missing, heterozygosity, duplicate)
  * join sample date and spawning state <"../OceanAK/AHRP Salmon Biological Data 20240105_122404.368459_with_spawning_state.csv">
  * filter for 9 adaptive run timing loci
  * export genotypes with date/spawning state

# Background

An adaptive marker panel associated with run timing variation in chum salmon was developed by NOAA ABL and used to genotype two plates of samples from the ADFG Alaska Hatchery Research Program (AHRP) study being conducted within SE Alaska. AHRP samples (Fish Creek - Douglas Island 2023 + Prospect Creek 2017/2018) were genotyped in August 2024 for 20 single nucleotide polymorphisms and weighted linear regressions were performed to assess the association of different SNPs with run timing (date of collection). The chum salmon adaptive marker panel contains 20 loci spread among 5 chromosomes and was developed from variation detected between Yukon River summer (Gisasa River) and fall (Pelly River) run chum salmon with low coverage whole genome sequencing.These populations spawn approximately two months and over 1500 river km apart. NOAA targeted the major peaks in differentiation with particular interest in markers within the genes leucine rich repeat containing nine (LRRC9; LG35) and estrogen receptor β (ESRB; LG29) as they were associated with return timing in other salmonid species. Eight SNPs (four on LG29 and four on LG35) appear to be associated with run timing within these collections. The strength of association for AHRP samples appears to vary by population. No markers on LGs 5, 23 or 15 appear to have a strong association with run timing within these collections. Overall, the results from LG29 and LG35 are promising and variation in these SNPs should be surveyed for a larger collection of samples.

A total of 9 adaptive, run timing loci were added to the AHRP SEAK Chum parentage panel: 

  * OkeV2_LG23_10546237
  * OkeV2_LG29_25450752
  * OkeV2_LG29_25455833
  * OkeV2_LG29_25483595
  * OkeV2_LG29_25515161
  * OkeV2_LG35_28078867
  * OkeV2_LG35_28128687
  * OkeV2_LG35_28165076
  * OkeV2_LG35_28167172

All AHRP Chum samples from Fish Creek - Douglas Island, Prospect Creek, Admiralty Creek, and Sawmill Creek were genotyped for the parentage panel at the ADF&G Gene Conservation Laboratory (GCL) in Winter/Spring 2025. Due to poor tissue quality, there was a high proportion of samples that failed to genotype and/or appear to be contaminated (high heterozygosity), resulting in low QC power for several plates. QC crosschecks are currently underway at the GCL in July 2025, however, Wes wants whatever genotypes we have on hand now so will just filter out those plates.

# Import Data

## *LocusControl*

Create `objects`, create *LocusControl* for `Chum_AHRP_257_v1.3.0.`
```{r LocusControl, eval=FALSE}
fs::dir_create("objects")
GCLr::create_locuscontrol(markersuite = "Chum_AHRP_257_v1.3.0",
                          username = .username,
                          password = .password)
GCLr::save_objects(objects = "LocusControl", path = "objects")
```

Hrmm, only 251 loci. Confirm that these are the same loci run in the most recent AHRP Chum project CM069
```{r, eval=FALSE}
probe_loci_CM069 <- readr::read_tsv(
  file = "V:/Lab/Genotyping/SNP Projects/Chum/Project CM069 AHRP Parentage/Genotype Data Files/set2rr_CM069_outputs/probes.txt", 
  show_col_types = FALSE) %>%
  dplyr::distinct(`#Locus_ID`) %>%
  dplyr::pull()

all.equal(probe_loci_CM069, LocusControl$locusnames)
```

Yes, all are the same.
```{r, eval=FALSE}
loci251 <- LocusControl$locusnames
GCLr::save_objects(objects = "loci251", path = "objects")
```

## Genotypes

Read in genotypes by projec to get sillyvec.
```{r, eval=FALSE}
rm(LocusControl)
# GCLr::loki2r_proj(project_name = paste0("CM0", 64:70), username = .username, password = .password)  # comment out to avoid re-running
```

What collections?
```{r, eval=FALSE}
(sillyvec <- sort(project_sillys))
```

Subset `sillyvec` to remove alevin collection
```{r, eval=FALSE}
(sillyvec <- grep(pattern = "a", x = sillyvec, ignore.case = FALSE, value = TRUE, invert = TRUE))
```

Wipe out `.gcl` objects and re-read in using `loki2r`.
```{r, eval=FALSE}
GCLr::save_objects(objects = "sillyvec", path = "objects")

rm(list = objects(pattern = ".gcl"))
rm(LocusControl)
```

Read in genotypes.
```{r eval=FALSE}
GCLr::load_objects(path = "objects", pattern = "LocusControl")

GCLr::loki2r(sillyvec = sillyvec,
             username = .username,
             password = .password, 
             test_type = "GTSNP")
```

Only took 5.39 minutes to read everything in at the office!

Save raw genotypes (useful for working offsite and off VPN).
```{r eval=FALSE}
fs::dir_create("data/genotypes")
fs::dir_create("data/genotypes/raw")
# GCLr::save_sillys(sillyvec = sillyvec, path = "data/genotypes/raw", rds = TRUE)  # hide so you don't overwrite on accident!
```


### Re-load

Re-load, if necessary
```{r}
GCLr::load_objects(path = "objects")
GCLr::load_sillys(sillyvec = sillyvec, path = "data/genotypes/raw", rds = TRUE)
```

## Metadata

Read in paired metadata from Salmon Biological Fact and Stream Specimens, format similar to `pws_pink_ped_dat`. **NOTE** I'm doing this in a bit of a hurry, so make sure to proof read carefully if recycling this code!
```{r}
(metadata <- readr::read_csv(file = "../OceanAK/AHRP Salmon Biological Data 20240105_122404.368459_with_spawning_state.csv"))
```

Modify
```{r}
(
  metadata <- metadata %>% 
    dplyr::select(-dplyr::contains("target")) %>% 
    dplyr::select(-dplyr::contains("determination")) %>% 
    dplyr::rename(date = sample_date,
                  length = length_mm,
                  silly = silly_code) %>% 
    tidyr::unite(col = "sample", dna_tray_code:dna_tray_well_code, sep = "_", remove = FALSE) %>% 
    tidyr::unite(col = "silly_source", c(silly, fish_id), sep = "_", remove = FALSE) %>% 
    dplyr::mutate(franz_id = stringr::str_c(stringr::str_sub(silly, 1, 2),  # 1st letter of species name, 1st letter of stream name
                                            stringr::str_sub(silly, -2, -1),  # 2 digit sample year
                                            "_",
                                            stringr::str_pad(fish_id, width = 5, pad = "0", side = "left")),  # 5 digit fishid using padded zeros up front)
                  year = lubridate::year(date),
                  stream = stringr::str_remove(string = location_code, pattern = " Creek"),
                  DOY = lubridate::yday(date),  # day of year
                  sex = dplyr::case_when(
                    sex == "M" ~ "Male",
                    sex == "F" ~ "Female",
                    sex == "U" ~ NA_character_,
                    is.na(sex) ~ NA_character_),
                  origin = dplyr::case_when(
                    otolith_mark_present == "NO" ~ "Natural",
                    otolith_mark_present == "YES" ~ "Hatchery"
                  ),  # add origin variable
                  origin = base::factor(origin, levels = c("Natural", "Hatchery")),  # make factor to ensure hatchery != red
                  spawning_state = dplyr::case_when(
                    spawning_state == "Unknown" ~ NA_character_,
                    TRUE ~ spawning_state
                  )  # force Unknown to NA
    ) %>%  
    dplyr::select(
      franz_id,  # unique identifier for FRANz (e.g. PE16_00001, species, stream, year, GCL fish_id)
      sample,  # ultimate data key for AHRP, DWP barcode + DWP position
      silly_source,  # unique identifier for LOKI (GCL database)
      stream,  # 5 pedigree streams
      year,  # sample year (aka death year for FRANz)
      origin,  # hatchery or wild
      # hatchery,  # which hatchery
      sex,  # male, female, unknown
      date,  # sample date
      DOY,  # day of year (aka julian sample date)
      length,  # mideye to hypural plate length in mm
      # intertidal,  # sampling location above or below intertidal
      # distance_mouth,  # riverdist approximate sampling location from mouth in meters 
      spawning_state,  # alive, pink gill, grey gill, rotting (aka index of stream life)
      # pre_spawn,  # TRUE/FALSE (added in 2016); if the eggs and milt do not easily release and carcass has completely full gonads
      # partial_spawn,  # TRUE/FALSE (added in 2015); fish is sampled (dead or alive) with more than a little eggs or milt
      # preyed_upon,  # TRUE/FALSE (added in 2015); gonads removed by predators
      # stream_trib,  # stream tributary (Gilmour, Paddy, and Stockdale have multiple)
      # latitude,  # sampling area (where fish were moved to prior to sampling, rough approximate of spawning location)
      # longitude,  # sampling area (where fish were moved to prior to sampling, rough approximate of spawning location)
      otolith_mark_present,  # otolith mark present (yes = hatchery, no = natural)
      otolith_mark_id,  # otolith hatchery mark
      otolith_mark_status_code,  # otolith read status code (reason for no read = overground, lost, etc.)
      silly,  # GCL collection code
      fish_id,  # GCL fish ID within a silly
      dna_tray_code,  # 10-digit DWP barcode
      dna_tray_well_code,  # 48 DWP position 1:48
      # dna_tray_well_pos,  # 48 DWP position A1:H6
      # distance_tide,  # riverdist approximate sampling location from maximum high tide
      # riverdist_seg,  # riverdist river segment
      # riverdist_vert,  # riverdist vertex
      # riverdist_snapdist,  # riverdist snapping distance (aka how far was sampling location lat/long from the stream polyline)
      # high_tide  # highest extent of high tide in meters from mouth, unique to each stream
      fw_age,
      sw_age
    )  
)       
```

# Sample Sizes

Verify that all genotyped samples have metadata.
```{r}
geno_silly_source <- sapply(sillyvec, function(silly) {get(paste0(silly, ".gcl"))$SillySource}) %>% unlist()

table(geno_silly_source %in% metadata$silly_source)
```

Shoot, which samples do not have any metadata?

Ah, I bet they are the Fish Creek tag samples from 2013 that Tyler Dann and Heather Liller collected!
```{r}
setdiff(geno_silly_source, metadata$silly_source)
```

Correct. Add these to metadata.
```{r}
metadata <- 
  dplyr::bind_rows(metadata,
                   CMFISHCRT13.gcl %>% 
                     dplyr::select(SILLY_CODE, FK_FISH_ID, SillySource, DNA_TRAY_CODE, DNA_TRAY_WELL_CODE, CAPTURE_DATE) %>% 
                     dplyr::rename(silly = SILLY_CODE,
                                   fish_id = FK_FISH_ID,
                                   silly_source = SillySource, 
                                   dna_tray_code = DNA_TRAY_CODE,
                                   dna_tray_well_code = DNA_TRAY_WELL_CODE,
                                   date = CAPTURE_DATE) %>% 
                     tidyr::unite(col = "sample", dna_tray_code:dna_tray_well_code, sep = "_", remove = FALSE) %>% 
                     tidyr::unite(col = "silly_source", c(silly, fish_id), sep = "_", remove = FALSE) %>% 
                     dplyr::mutate(franz_id = stringr::str_c(stringr::str_sub(silly, 1, 2),  # 1st letter of species name, 1st letter of stream name
                                                             stringr::str_sub(silly, -2, -1),  # 2 digit sample year
                                                             "_",
                                                             stringr::str_pad(fish_id, width = 5, pad = "0", side = "left")),  # 5 digit fishid using padded zeros up front)
                                   date = lubridate::as_date(date),
                                   year = lubridate::year(date),
                                   stream = "Fish - Douglas Island",
                                   DOY = lubridate::yday(date)  # day of year
                     ) 
)
```

Verify all samples have metadata.
```{r}
table(geno_silly_source %in% metadata$silly_source)
```

Excellent, all present and accounted for.

Record which samples were genotyped.
```{r}
metadata <- metadata %>% 
  dplyr::mutate(genotyped = silly_source %in% geno_silly_source)
```

How many samples per collection (silly)?
```{r}
metadata %>% 
  dplyr::count(genotyped, silly) %>% 
  tidyr::pivot_wider(names_from = genotyped, values_from = n)
```

## Check For Sample Duplicates

I got a weird many-to-many error in a join the first time I ran through this script, so I want to check and see if there are actually duplicate samples present and if so, remove them!
```{r}
# bind all samples together into a single .gcl object
my.gcl <- sapply(sillyvec, function(silly) {
  my.silly <- get(paste0(silly, ".gcl"))
}, simplify = FALSE) %>% 
  dplyr::bind_rows()
```

Any duplicate `SillySource`?
```{r}
table(table(my.gcl$SillySource))
```

Clean up
```{r}
rm(my.gcl)
```

Are there metadata duplicates?
```{r}
table(table(metadata$silly_source))
```

Well son of a biscuit, there are!
```{r}
metadata %>% 
  dplyr::filter(duplicated(silly_source) | duplicated(silly_source, fromLast = TRUE)) %>% 
  dplyr::arrange(silly_source)
```

Okay, they appear to be true duplicates, no conflicting columns. Remove duplicate rows.
```{r}
metadata <- metadata %>% 
  dplyr::distinct(silly_source, .keep_all = TRUE)
```

## Join Metadata

Join `metadata` to each silly by `SillySource`.
```{r}
x <- sapply(sillyvec, function(silly) {
  my.silly <- get(paste0(silly, ".gcl"))
  my.silly <- my.silly %>% dplyr::left_join(
    y = metadata %>% dplyr::select(-genotyped),
    by = dplyr::join_by(SillySource == silly_source)
  )
  assign(x = paste0(silly, ".gcl"),
         value = my.silly,
         pos = 1)
})
rm(x)  # suppresses output
```

# Filter QC Crosscheck Plates

Create a vector of plate IDs to toss, pending QC crosschecks.
```{r}
plate_ids_qcxcheck <- c(64461, 64462, 64480, 64482, 64483, 64484, 64485, 64486, 64502, 64503, 64509, 64511, 64512, 64513, 64514, 64515, 64542, 64543, 64582, 56362, 56363) %>% as.character()
```

Get sample size by silly.
```{r}
(
  sample_size_qcxcheck <- dplyr::tibble(silly = sillyvec) %>%
    dplyr::mutate(genotyped = GCLr::silly_n(sillyvec = sillyvec) %>% dplyr::pull(n))
)
```

Remove any fish on the plates pending QC crosschecks.
```{r}
x <- sapply(sillyvec, function(silly) {
  my.silly <- get(paste0(silly, ".gcl"))
  my.silly <- my.silly %>% dplyr::filter(!stringr::str_detect(string = PLATE_ID, pattern = stringr::str_c(plate_ids_qcxcheck, collapse = "|")))
  assign(x = paste0(silly, ".gcl"),
         value = my.silly,
         pos = 1)
})
rm(x)  # suppresses output
```

How many fish were removed by silly?
```{r}
(
  sample_size_qcxcheck <- sample_size_qcxcheck %>%
    dplyr::mutate(qcxcheck = genotyped - GCLr::silly_n(sillyvec = sillyvec) %>% dplyr::pull(n))
)
```

How many total samples?
```{r}
sum(sample_size_qcxcheck$qcxcheck)
```

Okay, that looks about right for 21 plates. Must have been some partial plates.

# Genotype QA

Standard data QA (GT-seq, no `conScore`):

  * Remove fish missing genotypes (**<80%** loci)
  * Remove contamination (heterozygosity outliers **+/- 3.5** modified z-score)
  * Remove duplicates (**>95%** loci concordance, remove **both** duplicates)

```{r}
(
  sample_size_qa <- dplyr::tibble(silly = sillyvec) %>%
    dplyr::mutate(genotyped = GCLr::silly_n(sillyvec = sillyvec) %>% dplyr::pull(n))
)
```

## Missing Loci (<80%)

Remove fish with <80% loci genotyped.
```{r}
miss_loci <-
  GCLr::remove_ind_miss_loci(sillyvec = sillyvec,
                             proportion = 0.8)

# miss_loci$IDs_Removed

GCLr::save_objects(objects = "miss_loci", path =  "../objects", rds = TRUE)

(
  sample_size_qa <- sample_size_qa %>%
    dplyr::mutate(missing = genotyped - GCLr::silly_n(sillyvec = sillyvec) %>% dplyr::pull(n))
)
```

Overall `r format(sum(sample_size_qa$missing), big.mark = ",")` samples dropped due to poor quality DNA. 

What was genotyping success by spawning state?
```{r}
miss_loci_tib <- tibble::enframe(miss_loci$IDs_Removed, name = "silly", value = "fish_id") %>% 
  tidyr::unnest_longer(fish_id) %>% 
  dplyr::left_join(y = metadata, by = dplyr::join_by(silly, fish_id))

metadata %>% 
  dplyr::filter(genotyped) %>% 
  dplyr::mutate(drop_miss = silly_source %in% miss_loci_tib$silly_source) %>% 
  dplyr::count(drop_miss, spawning_state) %>% 
  tidyr::pivot_wider(names_from = drop_miss, values_from = n) %>% 
  dplyr::rename(n_drop = `TRUE`, n_keep = `FALSE`) %>% 
  dplyr::mutate(n = n_keep + n_drop,
                p_keep = n_keep / n,
                spawning_state = factor(spawning_state, levels = c("Alive", "Pink Gill", "Grey Gill", "Rotting"))) %>% 
  dplyr::arrange(spawning_state)
```

As expected, genotyping success was much lower for rotting fish.

## Contamination (Heterozygosity > +/- 3.5 modified Z-score)

Remove contaminated samples prior to checking for duplicates!

Individual heterozygosity outliers is our best metric for removing contaminated samples in the absence of something more sophisticated like *conScore* from [GTscore](https://github.com/gjmckinney/GTscore?tab=readme-ov-file#sample-summaries). In previous analyses, including [Shedd et al. 2022](https://doi.org/10.1111/eva.13356), we've used the standard 1.5 IQR outlier detection method to remove excessively heterozygous individuals. However, the 1.5 IQR method assumes a normal distribution, but heart sample heterozygosities tend to be skewed right due to contamination. As of 2024-02-08, we decided to pivot to using [modified Z-scores](https://www.itl.nist.gov/div898/handbook/eda/section3/eda35h.htm) with cutoffs of +/- 3.5 as recommended by Iglewicz and Hoaglin [^1]. The modified Z-score uses the median and median absolute deviation (MAD) instead of the mean and standard deviation, and is thus more robust to outliers and asymmetrical distributions.

[^1]: Boris Iglewicz and David Hoaglin (1993), "Volume 16: How to Detect and Handle Outliers", The ASQC Basic References in Quality Control: Statistical Techniques, Edward F. Mykytka, Ph.D., Editor. 

Source function and calculate individual heterozygosities from `AHRP-PWS-Pink-Salmon-Pedigrees` GitHub repo.
```{r}
source("~/GitHub_repos/AHRP-PWS-Pink-Salmon-Pedigrees/code/functions/calc_ind_het.R")

(
  ind_het <-
    calc_ind_het(
      sillyvec = sillyvec,
      loci = loci251,
      ncores = parallel::detectCores() - 4
    )
)
```

Calculate +/- 3.5 modified z-score cutoffs by *stream*. 
**NOTE** this means that the exact heterozygosity cutoff will vary slightly by stream. For PWS Pink Salmon, I did it separately by silly, but some of our sample sizes are pretty small here and we have the separate "tag" sillys. I'm opting to do this by stream to allow for subtle changes in allele frequency per stream.
```{r}
# Function to calculate median absolute deviation (MAD)
mad <- function(x) {
  median(abs(x - median(x)))
}

# Calculate the modified z-score for each individual
(
  ind_het <- ind_het %>%
    dplyr::mutate(
      year = 2000 + as.numeric(stringr::str_sub(
        string = silly,
        start = -2,
        end = -1
      )),
      stream = dplyr::case_when(
        stringr::str_detect(string = silly, pattern = "ADM") ~ "Admiralty",
        stringr::str_detect(string = silly, pattern = "FISH") ~ "Fish",
        stringr::str_detect(string = silly, pattern = "PROS") ~ "Prospect",
        stringr::str_detect(string = silly, pattern = "SAW") ~ "Sawmill",
        TRUE ~ "mistakes_were_made"
      )
    ) %>%
    dplyr::group_by(stream) %>%  # cutoffs are specific to each stream!
    dplyr::mutate(
      modified_z_score = 0.6745 * (het - median(het)) / mad(het),
      outlier = dplyr::case_when(abs(modified_z_score) > 3.5 ~ TRUE, TRUE ~ FALSE)
    ) %>%
    dplyr::ungroup()
)
  
GCLr::save_objects("ind_het", path = "objects", rds = TRUE)
```

Plot distribution of heterozygosity and show outliers.
```{r warning=FALSE, fig.height=10}
ind_het %>% 
  ggplot2::ggplot(ggplot2::aes(x = het, fill = outlier)) +
  ggplot2::geom_histogram(binwidth = 1 / length(loci251)) +
  ggplot2::facet_grid(rows = ggplot2::vars(stream), scales = "free_y") +
  ggplot2::xlim(0, 1) +
  ggplot2::scale_fill_manual(name = "Outlier", values = c("TRUE" = "black", "FALSE" = "grey60")) +
  ggplot2::xlab("Individual Heterozygosity") +
  ggplot2::ylab("Frequency") +
  ggplot2::ggtitle("Individual Heterozygosity - By Stream") +
  ggplot2::theme_bw(base_size = 14) 
```

Remove outliers.
```{r}
output <- lapply(sillyvec, function(x) {
  GCLr::remove_ids(
    silly = x,
    IDs = ind_het %>% dplyr::filter(outlier, silly == x) %>% dplyr::pull(fish_id)
  )
})

message(paste0("A total of ", ind_het %>% dplyr::filter(outlier) %>% nrow(), " individuals were removed from sillys in sillyvec."))
```

Not too many contaminated samples! Most likely culled during missing loci.
```{r}
(
  sample_size_qa <- sample_size_qa %>%
    dplyr::mutate(heterozygosity = genotyped - missing - GCLr::silly_n(sillyvec = sillyvec) %>% dplyr::pull(n))
)
```

Overall `r format(sum(sample_size_qa$heterozygosity), big.mark = ",")` samples dropped due to sample contamination. 

## Duplicate (>=95%)

Identify potential duplicate genotypes (>=95% loci).
```{r}
duplicate_check_95 <-
  GCLr::dupcheck_within_silly(
    sillyvec = sillyvec,
    minproportion = 0.95,
    minnonmissing = 0.6,
    ncores = parallel::detectCores() - 4
  )

GCLr::save_objects("duplicate_check_95", path = "objects", rds = TRUE)
```

How many duplicate pairs by silly?
```{r}
duplicate_check_95 %>% 
  dplyr::count(silly) %>% 
  dplyr::arrange(dplyr::desc(n))
```

Remove **both** duplicates! As opposed to GSI work, where we want to keep individuals but aren’t typically worried about paired data, here we want to remove both individuals as the paired data integrity (including otolith reads!) is lost.
```{r}
duplicates_removed <-
  GCLr::remove_dups(dupcheck = duplicate_check_95, remove_both = TRUE)

GCLr::save_objects("duplicates_removed", path = "objects", rds = TRUE)

(
  sample_size_qa <- sample_size_qa %>%
    dplyr::mutate(
      duplicate = genotyped - missing - heterozygosity - GCLr::silly_n(sillyvec = sillyvec) %>% dplyr::pull(n)
    )
)
```

Overall `r format(sum(sample_size_qa$duplicate), big.mark = ",")` samples dropped due to duplicate genotypes. 

## Final

Final, post-QA sample sizes by silly.
```{r}
(
  sample_size_qa <- sample_size_qa %>%
    dplyr::mutate(
      final =  GCLr::silly_n(sillyvec = sillyvec) %>% dplyr::pull(n)
    )
)

GCLr::save_objects("sample_size_qa", path = "objects", rds = TRUE)

fs::dir_create("output")
readr::write_csv(x = sample_size_qa, file = "output/sample_size_qa.csv")
```

# Save

Save final genotypes and update metadata.

## Genotypes

Save post-QA genotypes with joined metadata.
```{r}
fs::dir_create("data/genotypes/postQA")
GCLr::save_sillys(sillyvec = sillyvec, path = "data/genotypes/postQA", rds = TRUE)
```

## Metadata

add `pass_QA`
```{r}
postQA_silly_source <- sapply(sillyvec, function(silly) {get(paste0(silly, ".gcl"))$SillySource}) %>% unlist()

metadata <- metadata %>% 
  dplyr::mutate(pass_QA = silly_source %in% postQA_silly_source)
```

Confirm.
```{r}
metadata %>% 
  dplyr::count(genotyped, pass_QA)
```

**NOTE** that `genotyped` in `sample_size_qa` doesn't match `genotyped` in `metadata` because I pre-filtered out QC Crosscheck plates.
```{r}
sample_size_qa %>% 
  janitor::adorn_totals() %>% 
  dplyr::filter(silly == "Total")
```

# QA Conclusions

Overall `r format(sum(metadata$genotyped), big.mark = ",")` total pedigree samples were genotyped; `r format(sum(metadata$genotyped) - sum(sample_size_qa$genotyped), big.mark = ",")` were removed due to pending QC Crosschecks; `r format(sum(sample_size_qa$missing), big.mark = ",")` were removed due to poor quality DNA (missing >= 20% loci); `r format(sum(sample_size_qa$heterozygosity), big.mark = ",")` were removed due to contaminated DNA (heterozygosity >= 3.5 modified z-score); and `r format(sum(sample_size_qa$duplicate), big.mark = ",")` were removed due to duplicate genotypes (>= 95% loci), leaving a total of **`r format(sum(sample_size_qa$final), big.mark = ",")`** in the final dataset.

# Adaptive Loci Genotypes

Create a single object with just the metadata and adaptive loci genotypes.
```{r}
(
  chum_adaptive_geno <- sapply(sillyvec, function(silly) {
    get(paste0(silly, ".gcl"))
  }, simplify = FALSE) %>%
    dplyr::bind_rows() %>%
    dplyr::select(
      silly_source = SillySource,
      plate_id = PLATE_ID,
      stream:sw_age,
      dplyr::contains("OkeV2")
    )
)
```

## Calculate Allele Frequencies

Calculate the allele counts for each locus by stream, year, and DOY.
Adapted from Pat Barry's code in `AHRP_ChumAdaptive.html`
```{r}
loci_adaptive <- grep(pattern = "OkeV2", x = loci251, value = TRUE)

# testing
# chum_adaptive_geno %>%
#   dplyr::select(silly_source, stream, year, DOY, tidyselect::starts_with(loci_adaptive[1])) %>%
#   tidyr::pivot_longer(cols = tidyselect::starts_with(loci_adaptive[1]), names_to = "locus", values_to = "allele") %>%
#   dplyr::mutate(allele = factor(allele),
#                 allele = as.numeric(allele)) %>%  # make factor and convert to numeric
#   dplyr::count(stream, year, DOY, allele) %>%
#   dplyr::mutate(locus = loci_adaptive[1])

# calculate the number of each allele observed per stream, year, day
(chum_adaptive_allele_counts <- lapply(loci_adaptive, function(locus) {
  chum_adaptive_geno %>%
    dplyr::select(silly_source,
                  stream,
                  year,
                  DOY,
                  tidyselect::starts_with(locus)) %>%
    tidyr::pivot_longer(
      cols = tidyselect::starts_with(locus),
      names_to = "locus",
      values_to = "allele"
    ) %>%
    dplyr::mutate(allele = factor(allele),
                allele = as.numeric(allele)) %>%  # make factor and convert to numeric
    dplyr::count(stream, year, DOY, allele) %>%
    dplyr::mutate(locus = locus)
}) %>%
    dplyr::bind_rows())
```

Convert to alelle frequencies.
```{r}
(
  chum_adaptive_allele_freq <- chum_adaptive_allele_counts %>%
    tidyr::pivot_wider(names_from = allele, values_from = n) %>%
    dplyr::select(-`NA`) %>%  # drop no-calls
    dplyr::mutate(
      `1` = replace_na(`1`, 0),
      # account for unobserved alleles on a given day
      `2` = replace_na(`2`, 0)
    ) %>%
    tidyr::pivot_longer(
      cols = `1`:`2`,
      names_to = "allele",
      values_to = "n"
    ) %>%
    dplyr::group_by(stream, year, DOY, locus) %>%
    dplyr::mutate(
      n_alleles = sum(n),
      p_alleles = n / n_alleles,
      SNP = paste(locus, allele, sep = "_")
    ) %>% 
    dplyr::ungroup()
)
```

## View Allele Frequencies

```{r}
#  TODO
# double check allele frequency code to make sure it is correct
# make sure alleles are being assigned 1 vs. 2 consistently
# filter out hatchery strays?
# check sample sizes by stream/year/DOY?
# see if spawning state correlates?
```

### Admiralty

```{r fig.height=10}
chum_adaptive_allele_freq %>% 
  dplyr::filter(stream == "Admiralty") %>% 
  dplyr::mutate(locus = stringr::str_remove_all(string = locus, pattern = "OkeV2_")) %>% 
  ggplot2::ggplot(ggplot2::aes(x = DOY, y = p_alleles, colour = SNP, group_by(SNP))) +
  ggplot2::geom_point() +
  ggplot2::geom_smooth(method = "lm", weight = "n_alleles", formula = y~x) +
  ggplot2::facet_grid(locus ~ year) + 
  ggplot2::theme_bw() + 
  ggplot2::theme(legend.position = 'none')
```

### Fish

```{r fig.height=10}
chum_adaptive_allele_freq %>% 
  dplyr::filter(stream == "Fish - Douglas Island") %>% 
  dplyr::mutate(locus = stringr::str_remove_all(string = locus, pattern = "OkeV2_")) %>% 
  ggplot2::ggplot(ggplot2::aes(x = DOY, y = p_alleles, colour = SNP, group_by(SNP))) +
  ggplot2::geom_point() +
  ggplot2::geom_smooth(method = "lm", weight = "n_alleles", formula = y~x) +
  ggplot2::facet_grid(locus ~ year) + 
  ggplot2::theme_bw() + 
  ggplot2::theme(legend.position = 'none')
```

### Prospect

```{r fig.height=10}
chum_adaptive_allele_freq %>% 
  dplyr::filter(stream == "Prospect") %>% 
  dplyr::mutate(locus = stringr::str_remove_all(string = locus, pattern = "OkeV2_")) %>% 
  ggplot2::ggplot(ggplot2::aes(x = DOY, y = p_alleles, colour = SNP, group_by(SNP))) +
  ggplot2::geom_point() +
  ggplot2::geom_smooth(method = "lm", weight = "n_alleles", formula = y~x) +
  ggplot2::facet_grid(locus ~ year) + 
  ggplot2::theme_bw() + 
  ggplot2::theme(legend.position = 'none')
```

### Sawmill

```{r fig.height=10}
chum_adaptive_allele_freq %>% 
  dplyr::filter(stream == "Sawmill") %>% 
  dplyr::mutate(locus = stringr::str_remove_all(string = locus, pattern = "OkeV2_")) %>% 
  ggplot2::ggplot(ggplot2::aes(x = DOY, y = p_alleles, colour = SNP, group_by(SNP))) +
  ggplot2::geom_point() +
  ggplot2::geom_smooth(method = "lm", weight = "n_alleles", formula = y~x) +
  ggplot2::facet_grid(locus ~ year) + 
  ggplot2::theme_bw() + 
  ggplot2::theme(legend.position = 'none')
```

end